{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "GPU_id = 6\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import cudf as gd\n",
    "import cupy as cp\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "import os\n",
    "import time\n",
    "import nvstrings\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Global"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "GPU_RUN_TIME = {}\n",
    "CPU_RUN_TIME = {}\n",
    "STEPS = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def on_gpu(words,func,arg=None,dtype=np.int32):\n",
    "    res = cp.array(words.size(), dtype=dtype)\n",
    "    if arg is None:\n",
    "        cmd = 'words.%s(res.device_ctypes_pointer.value)'%(func)\n",
    "    else:\n",
    "        cmd = 'words.%s(arg,res.device_ctypes_pointer.value)'%(func)\n",
    "    eval(cmd)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "if os.path.exists('../../cache')==False:\n",
    "    os.mkdir('../../cache')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/datasets/trivago/data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'read csv'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pandas read csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train & test (15932992, 12) (3782335, 12)\n",
      "combined (19715327, 12)\n",
      "CPU times: user 28.6 s, sys: 6.12 s, total: 34.7 s\n",
      "Wall time: 34.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "train_pd = pd.read_csv('%s/train.csv'%path)\n",
    "test_pd = pd.read_csv('%s/test.csv'%path)\n",
    "submission_pd = pd.read_csv('%s/submission_popular.csv'%path)\n",
    "print(\"train & test\",train_pd.shape,test_pd.shape)\n",
    "data_pd = pd.concat([train_pd,test_pd])\n",
    "print('combined',data_pd.shape)\n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "del train_pd,test_pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Only keep click out rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'string comparsion and masking'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pandas string comparsion and masking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# of clickouts: 2115365\n",
      "true test (253573, 13)\n",
      "true test shape match submission shape\n",
      "CPU times: user 5.2 s, sys: 2.86 s, total: 8.05 s\n",
      "Wall time: 8.05 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "data_pd['is_click_out'] = data_pd['action_type']=='clickout item'\n",
    "data_pd = data_pd[data_pd['is_click_out']]\n",
    "\n",
    "data_pd.drop('is_click_out',axis=1,inplace=True)\n",
    "print(\"# of clickouts:\",data_pd.shape[0])\n",
    "data_pd['clickout_missing'] = data_pd['reference'].isnull()\n",
    "\n",
    "print('true test',data_pd[data_pd['clickout_missing']].shape)\n",
    "assert submission_pd.shape[0] == data_pd[data_pd['clickout_missing']].shape[0]\n",
    "print('true test shape match submission shape')\n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n",
      "Wall time: 6.5 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_pd['row_id'] = np.arange(data_pd.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create recommendation list from `impressions`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'string column split & expand'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pandas string column split and expand"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 33.4 s, sys: 3.69 s, total: 37.1 s\n",
      "Wall time: 37.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "candidates_pd = data_pd['impressions'].str.split(\"|\", expand = True) \n",
    "prices_pd = data_pd['prices'].str.split(\"|\", expand = True) \n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 412 ms, sys: 136 ms, total: 548 ms\n",
      "Wall time: 544 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_pd.drop('impressions',axis=1,inplace=True)\n",
    "data_pd.drop('prices',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Assign string columns to dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'assign string columns to dataframe'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13.5 s, sys: 2.1 s, total: 15.6 s\n",
      "Wall time: 15.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "data_pd_rec_list = data_pd[['row_id']]\n",
    "for i in range(candidates_pd.shape[1]):\n",
    "    data_pd_rec_list['item_%d'%i] = candidates_pd[i]\n",
    "    data_pd_rec_list['price_%d'%i] = prices_pd[i]\n",
    "data_pd_rec_list = data_pd_rec_list.set_index('row_id')\n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create data pair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'create data pair'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 19s, sys: 23.7 s, total: 1min 43s\n",
      "Wall time: 1min 43s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "\n",
    "cols = [i for i in data_pd_rec_list.columns if i.startswith('item_')]\n",
    "items = data_pd_rec_list[cols].stack().reset_index()\n",
    "items.columns = ['row_id','candidate_order','item_id']\n",
    "\n",
    "cols = [i for i in data_pd_rec_list.columns if i.startswith('price_')]\n",
    "prices = data_pd_rec_list[cols].stack().reset_index()\n",
    "prices.columns = ['row_id','candidate_order','price']\n",
    "\n",
    "items['price'] = prices['price'].astype(int)\n",
    "items['candidate_order'] = items['candidate_order'].apply(lambda x:x.split('_')[1]).astype(int)\n",
    "\n",
    "count = items['row_id'].value_counts()\n",
    "items['row_id_count'] = items['row_id'].map(count)\n",
    "items = items[items['row_id_count']>1]\n",
    "\n",
    "data_pd['clickout_missing'] = data_pd['clickout_missing'].astype(int)\n",
    "data_pair_pd = items.merge(data_pd,on='row_id',how='left')\n",
    "\n",
    "data_pair_pd['reference'] = data_pair_pd['reference'].fillna(-1).astype(int)\n",
    "data_pair_pd['item_id'] = data_pair_pd['item_id'].fillna(-1).astype(int)\n",
    "data_pair_pd['target'] = data_pair_pd['reference'] == data_pair_pd['item_id']\n",
    "data_pair_pd['target'] = data_pair_pd['target'].astype(int)\n",
    "\n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Save To Parquet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Get Model Ready & Export to Parquet\n",
    "Take the current dataframe and export to training, validation, and test sets for processing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']==0]\n",
    "test_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']>0]\n",
    "train_pair_pd['is_va'] = train_pair_pd.row_id%5 == 0\n",
    "train_pair = train_pair_pd[train_pair_pd['is_va']==0]\n",
    "valid_pair = train_pair_pd[train_pair_pd['is_va']>0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_pair = train_pair.drop(columns=['is_va'])\n",
    "valid_pair = valid_pair.drop(columns=['is_va'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8551343, 18)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_pair.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for Fastai version\n",
    "data_pair_pd.to_parquet('../../cache/data_pair.parquet')\n",
    "# for Tensor version\n",
    "valid_pair.to_parquet('../../cache/valid.parquet')\n",
    "train_pair.to_parquet('../../cache/train.parquet')\n",
    "test_pair_pd.to_parquet('../../cache/test.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the timing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CPU_RUN_TIME['Overall'] = sum([CPU_RUN_TIME[i] for i in STEPS])\n",
    "STEPS.append('Overall')\n",
    "\n",
    "timing = pd.DataFrame()\n",
    "timing['step'] = STEPS\n",
    "timing['CPU'] = [CPU_RUN_TIME[i] for i in STEPS]\n",
    "timing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
